In this homework, we will use the NCI60 cancer cell line microarray data, which consists of 6,830 gene expression measurements on 64 cancer cell lines. You need to install the ISLR package in R and then load it, together with the other packages used below.
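If any of the packages are missing, a one-time install from CRAN comes first; a minimal sketch, assuming all of them are available on CRAN:
# install.packages(c("ISLR", "tidyverse", "hyperSpec", "factoextra",
#                    "cluster", "gridExtra", "tightClust"))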
library(ISLR)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(hyperSpec)
## Loading required package: lattice
## Loading required package: grid
## Loading required package: xml2
## Package hyperSpec, version 0.100.0
##
## To get started, try
## vignette ("hyperspec")
## package?hyperSpec
## vignette (package = "hyperSpec")
##
## If you use this package please cite it appropriately.
## citation("hyperSpec")
## will give you the correct reference.
##
## The project homepage is http://hyperspec.r-forge.r-project.org
##
## Attaching package: 'hyperSpec'
## The following object is masked from 'package:dplyr':
##
## collapse
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(cluster)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(tightClust)
dim(NCI60$data)
## [1] 64 6830
head(NCI60$labs)
## [1] "CNS" "CNS" "CNS" "RENAL" "BREAST" "CNS"
table(NCI60$labs)
##
## BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA
## 7 5 7 1 1 6
## MCF7A-repro MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE
## 1 1 8 9 6 2
## RENAL UNKNOWN
## 9 1
Solution for Problem 1
Part 1
D60 <- NCI60$data
# Label each row with its index and cancer type so dendrogram leaves are readable
rownames(D60) <- paste(seq_along(NCI60$labs), " ", NCI60$labs)
D60 <- as.data.frame(D60)
part_1 <- function(data = D60, dist = 'euclidean', linkage = 'single') {
  # Correlation-based distance via hyperSpec; otherwise stats::dist()
  if (dist == 'pearson') dist_mat <- pearson.dist(data)
  else dist_mat <- dist(data, method = dist)
  output <- hclust(dist_mat, method = linkage)
  plot(output, cex = 0.5, hang = 0.05, xlab = "Cell type",
       lwd = 1, ylab = "Dist",
       main = paste("Cluster:", dist, "and", linkage))
  output
}
plot1 <- part_1(D60, "pearson", "complete")
plot2 <- part_1(D60, "pearson", "average")
plot3 <- part_1(D60, "pearson", "single")
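For reference, a correlation-based distance can also be built directly in base R. The sketch below computes 1 minus the pairwise Pearson correlation between rows; it may differ from hyperSpec's pearson.dist by a constant scaling, so treat it as an illustration rather than a drop-in replacement:
cor_dist <- function(data) {
  # cor() works column-wise, so transpose to correlate rows (cell lines)
  as.dist(1 - cor(t(as.matrix(data))))
}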
Part 2
std_normal <- function(df) {
  # Column-wise means and standard deviations (one per gene)
  mean_data <- apply(df, 2, mean)
  std_data <- apply(df, 2, sd)
  normalized_API <- function(data_vec) {
    (data_vec - mean_data) / std_data
  }
  # Standardize each row against the column statistics, then transpose back
  normalized_data <- t(apply(df, 1, normalized_API))
  normalized_data
}
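As a side note, base R's scale() performs the same column-wise standardization, which gives a quick way to sanity-check the helper above:
# Should be TRUE: scale() and std_normal() agree up to attributes
all.equal(as.vector(scale(as.matrix(D60))), as.vector(std_normal(D60)))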
normalized_data <- std_normal(D60)
plot4 <- part_1(normalized_data, "pearson", "complete")
plot5 <- part_1(normalized_data, "pearson", "average")
plot6 <- part_1(normalized_data, "pearson", "single")
Part 3
quantl_normal <- function(df){
  # Rank each value within its column (ties broken by minimum rank)
  df_rank <- apply(df, 2, rank, ties.method = "min")
  # Sort each column, then average across columns at each rank position
  df_sorted <- data.frame(apply(df, 2, sort))
  df_mean <- apply(df_sorted, 1, mean)
  # Replace each value by the cross-column mean value at its rank
  index_to_mean <- function(my_index, my_mean){
    return(my_mean[my_index])
  }
  df_final <- apply(df_rank, 2, index_to_mean, my_mean = df_mean)
  rownames(df_final) <- rownames(df)
  return(df_final)
}
quantl_data <- quantl_normal(D60)
plot7 <- part_1(quantl_data, "pearson", "complete")
plot8 <- part_1(quantl_data, "pearson", "average")
plot9 <- part_1(quantl_data, "pearson", "single")
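After quantile normalization every column should carry essentially the same set of values; a quick check (not in the original code) is to compare two sorted columns:
sorted_cols <- apply(quantl_data, 2, sort)
range(sorted_cols[, 1] - sorted_cols[, 2])  # ~ c(0, 0), barring ties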
Part 4
From all the plots, plots 4 and 7 (complete linkage on the standardized and on the quantile-normalized data) look the most reasonable to me.
clustig <- function(cluster, k = 4, pca = FALSE) {
  # Cut the tree into k groups and outline them on the dendrogram
  sub_grp <- cutree(cluster, k = k)
  plot(cluster, cex = 0.5, hang = 0.05, xlab = "Cell type",
       lwd = 1, ylab = "Dist")
  rect.hclust(cluster, k = k, border = 2:11)
  if (pca) {
    # Project onto the first two principal components, colored by cluster
    fviz_cluster(list(data = D60, cluster = sub_grp),
                 labelsize = 6)
  }
}
clustig(plot1, pca=T)
clustig(plot2, pca=T)
clustig(plot3, pca=T)
clustig(plot4, pca=T)
clustig(plot5, pca=T)
clustig(plot6, pca=T)
clustig(plot7, pca=T)
clustig(plot8, pca=T)
clustig(plot9, pca=T)
Part 5
Standardization should be performed; quantile normalization also seems reasonable; and average linkage and complete linkage are the most suitable linkages in this case.
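One way to back this up numerically (an extra check, not required by the assignment) is the cophenetic correlation, which measures how faithfully each dendrogram preserves the original pairwise distances; higher is better:
d_q <- pearson.dist(quantl_data)
sapply(list(complete = plot7, average = plot8, single = plot9),
       function(h) cor(d_q, cophenetic(h)))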
Problem 2
pro2 <- function(data = D60, dist = 'euclidean', K = 4, pca = FALSE) {
  # The distance matrix is only used for the silhouette; kmeans itself
  # always operates on the raw coordinates with Euclidean distance
  if (dist == 'pearson') dist_mat <- pearson.dist(data)
  else dist_mat <- dist(data, method = dist)
  set.seed(1)
  k <- kmeans(data, centers = K, nstart = 20)
  s <- silhouette(k$cluster, dist_mat)
  if (pca) {
    p <- fviz_cluster(k, data = data, labelsize = 7) +
      ggtitle(paste("K=", K, "; Dist: ", dist))
    list(plot = p, s_stat = s)
  } else s
}
pro2(data = D60, dist = 'euclidean', K = 4, pca = TRUE)$plot
pro2(data = normalized_data, dist = 'euclidean', K = 4, pca = TRUE)$plot
pro2(data = quantl_data, dist = 'euclidean', K = 4, pca = TRUE)$plot
Compare the three sets of clustering results from Problems 3.1, 3.2 and 3.3. Also compare them to the most reasonable clustering result you obtained in Problem 2.4.
Based on the results in Problem 3.4, please explain whether you think standardization should be performed for K-means, whether you think quantile normalization should be used before K-means, and which clustering method, K-means with Euclidean distance or hierarchical clustering with correlation distance, performs better in this application.
Solution for 4 and 5:
Comparing all three clustering results, there are no substantial differences among them. Based on the results in Problem 3.4, standardization should be performed for K-means, quantile normalization is not necessary before K-means, and the two methods, K-means with Euclidean distance and hierarchical clustering with correlation distance, perform about the same in this application.
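A concrete way to compare the two methods (an illustrative check, not part of the original write-up) is to cross-tabulate the K-means assignments against the hierarchical clusters cut at the same K:
set.seed(1)
km <- kmeans(quantl_data, centers = 4, nstart = 20)
hc <- cutree(plot7, k = 4)  # plot7: complete linkage on quantile-normalized data
table(kmeans = km$cluster, hclust = hc)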
Problem 3
Solution:
I choose K = 6 based on the plots below.
valu_k <- 2:8
pro3 <- function(k, data = D60) {
  # Average silhouette width for K-means with k clusters
  s <- pro2(data = data, dist = 'euclidean', K = k, pca = FALSE)
  mean(s[, 3])
}
avg_sil_values <- map_dbl(valu_k, pro3)
plot(valu_k, avg_sil_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouettes", main = "Silhouettes, K-means")
set.seed(1)
fviz_nbclust(D60, FUNcluster = kmeans, method = "silhouette")
gap_stat <- clusGap(D60, FUNcluster = kmeans, nstart = 20, K.max = 7, B = 50)
fviz_gap_stat(gap_stat)
Problem 4
Solution:
The tight clustering results are reasonable and more meaningful than the K-means clustering results.
TgClust <- tight.clust(D60, target=3, k.min=9, random.seed=1)
## Number of points: 64 Dimension: 6830
##
## Looking for tight cluster 1 ...
## k = 9
## k = 10
## 1 tight cluster(s) found!
## Cluster size: 7 Remaining number of points: 57
##
## Looking for tight cluster 2 ...
## k = 8
## k = 9
## 2 tight cluster(s) found!
## Cluster size: 7 Remaining number of points: 50
##
## Looking for tight cluster 3 ...
## k = 7
## k = 8
## 3 tight cluster(s) found!
## Cluster size: 7 Remaining number of points: 43
plot(TgClust, standardize.gene = TRUE)
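To see what the tight clusters correspond to biologically, the assignments can be cross-tabulated with the known cell-line labels. This assumes, per the tightClust documentation, that the fitted object stores assignments in $cluster, with -1 marking points left outside any tight cluster:
# -1 = not assigned to any tight cluster
table(tight = TgClust$cluster, type = NCI60$labs)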
Problem 6
Solution:
For 6.1: with this dissimilarity matrix, cutting either dendrogram into two clusters yields {1, 2, 3} and {4}; see the cutree() check after the plots.
d <- as.dist(matrix(c(0, 0.3, 0.4, 0.7,
                      0.3, 0, 0.5, 0.8,
                      0.4, 0.5, 0.0, 0.9,
                      0.7, 0.8, 0.9, 0.0), nrow = 4))
plot(hclust(d, method = "complete"))
plot(hclust(d, method = "single"))
Problem 7
Part 4
mu0 <- 0; mu1 <- 2; sigma <- 1; p <- 0.5
# Latent component labels: P(z = 0) = p, P(z = 1) = 1 - p
z <- sample(c(0, 1), 500, replace = TRUE, prob = c(p, 1 - p))
x <- c()
for (i in 1:500) {
  if (z[i] == 0) x <- c(x, rnorm(1, mu0, sigma))
  else x <- c(x, rnorm(1, mu1, sigma))
}
hist(x, breaks = 80, main = "Simulated mixture data")
# Complete-data MLEs (z observed); p_hat estimates P(z = 1)
p_hat <- mean(z)
mu0_hat <- sum(x * (1 - z)) / sum(1 - z); mu1_hat <- sum(x * z) / sum(z)
# Pooled variance estimate (sigma_hat is a variance, not an sd)
sigma_hat <- (sum((x - mu1_hat)^2 * z) + sum((x - mu0_hat)^2 * (1 - z))) / 500
print(paste(mu0_hat,mu1_hat,sigma_hat))
## [1] "-0.0335743224125288 2.07101869444064 1.0120808916937"
Part 5
# Initial guesses (gamma_hat = mixing weight of component 1; sigmas are variances)
mu0_hat <- 0.5; mu1_hat <- 1.5; sigma0_hat <- 0.5; sigma1_hat <- 0.5
gamma_hat <- 0.3
loglike <- c()
for (i in 1:15) {
  # Observed-data log-likelihood under the current parameters
  loglike <- c(loglike,
               sum(log(gamma_hat * dnorm(x, mean = mu1_hat, sd = sqrt(sigma1_hat)) +
                       (1 - gamma_hat) * dnorm(x, mean = mu0_hat, sd = sqrt(sigma0_hat)))))
  # E-step: posterior probability that each point comes from component 1
  conditional_exp <- gamma_hat * dnorm(x, mean = mu1_hat, sd = sqrt(sigma1_hat)) /
    (gamma_hat * dnorm(x, mean = mu1_hat, sd = sqrt(sigma1_hat)) +
     (1 - gamma_hat) * dnorm(x, mean = mu0_hat, sd = sqrt(sigma0_hat)))
  # M-step: update the mixing weight, means, and variances
  gamma_hat <- mean(conditional_exp)
  print(gamma_hat)
  mu1_hat <- sum(x * conditional_exp) / sum(conditional_exp)
  mu0_hat <- sum(x * (1 - conditional_exp)) / sum(1 - conditional_exp)
  sigma1_hat <- sum(x^2 * conditional_exp) / sum(conditional_exp) - mu1_hat^2
  sigma0_hat <- sum(x^2 * (1 - conditional_exp)) / sum(1 - conditional_exp) - mu0_hat^2
  print(paste("mu1: ", round(mu1_hat, 3), "; mu0: ", round(mu0_hat, 3)))
  print(paste("sigma1: ", round(sigma1_hat, 3), "; sigma0: ", round(sigma0_hat, 3)))
}
## [1] 0.4173923
## [1] "mu1: 2.257 ; mu0: 0.168"
## [1] "sigma1: 0.908 ; sigma0: 1.164"
## [1] 0.4187955
## [1] "mu1: 2.257 ; mu0: 0.163"
## [1] "sigma1: 0.88 ; sigma0: 1.176"
## [1] 0.418755
## [1] "mu1: 2.26 ; mu0: 0.161"
## [1] "sigma1: 0.87 ; sigma0: 1.173"
## [1] 0.418472
## [1] "mu1: 2.263 ; mu0: 0.159"
## [1] "sigma1: 0.864 ; sigma0: 1.17"
## [1] 0.4181479
## [1] "mu1: 2.266 ; mu0: 0.158"
## [1] "sigma1: 0.859 ; sigma0: 1.166"
## [1] 0.4178168
## [1] "mu1: 2.269 ; mu0: 0.158"
## [1] "sigma1: 0.856 ; sigma0: 1.164"
## [1] 0.4174849
## [1] "mu1: 2.271 ; mu0: 0.158"
## [1] "sigma1: 0.853 ; sigma0: 1.163"
## [1] 0.4171539
## [1] "mu1: 2.272 ; mu0: 0.158"
## [1] "sigma1: 0.851 ; sigma0: 1.161"
## [1] 0.4168244
## [1] "mu1: 2.274 ; mu0: 0.158"
## [1] "sigma1: 0.849 ; sigma0: 1.161"
## [1] 0.4164969
## [1] "mu1: 2.275 ; mu0: 0.158"
## [1] "sigma1: 0.847 ; sigma0: 1.16"
## [1] 0.4161718
## [1] "mu1: 2.276 ; mu0: 0.159"
## [1] "sigma1: 0.846 ; sigma0: 1.16"
## [1] 0.4158493
## [1] "mu1: 2.277 ; mu0: 0.159"
## [1] "sigma1: 0.845 ; sigma0: 1.16"
## [1] 0.4155297
## [1] "mu1: 2.278 ; mu0: 0.159"
## [1] "sigma1: 0.844 ; sigma0: 1.161"
## [1] 0.415213
## [1] "mu1: 2.279 ; mu0: 0.16"
## [1] "sigma1: 0.843 ; sigma0: 1.161"
## [1] 0.4148994
## [1] "mu1: 2.28 ; mu0: 0.161"
## [1] "sigma1: 0.843 ; sigma0: 1.161"
plot(loglike, type = 'l', lwd = 3,
     main = "Observed-data log-likelihood",
     xlab = "EM iteration", ylab = "Log-likelihood")
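EM guarantees that the observed-data log-likelihood never decreases from one iteration to the next, which gives a quick sanity check on the implementation:
# Should be TRUE: the log-likelihood is non-decreasing (up to rounding)
all(diff(loglike) >= -1e-8)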